# This file is the source of a knitted report: lines beginning with '##' are
# the echoed console output from a previous run, kept for reference.
library(knitr)
# data url =http://bit.ly/CarreFourSalesDataset
# Read the Carrefour sales data from the remote CSV.
# na.strings = "" makes empty cells come in as NA rather than empty strings.
sales<- read.csv('http://bit.ly/CarreFourSalesDataset',na.strings = "")
# printing the top six rows of the dataset
head(sales)
## Date Sales
## 1 1/5/2019 548.9715
## 2 3/8/2019 80.2200
## 3 3/3/2019 340.5255
## 4 1/27/2019 489.0480
## 5 2/8/2019 634.3785
## 6 3/25/2019 627.6165
# Previewing the datatypes of our data
# Date is read as character ("m/d/YYYY") and will need explicit parsing later.
str(sales)
## 'data.frame': 1000 obs. of 2 variables:
## $ Date : chr "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
## $ Sales: num 549 80.2 340.5 489 634.4 ...
#checking the size/shape of a data frame
dim(sales)
## [1] 1000 2
library(anomalize)
## == Use anomalize to improve your Forecasts by 50%! =============================
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(tibbletime)
##
## Attaching package: 'tibbletime'
## The following object is masked from 'package:stats':
##
## filter
library(timetk)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date() masks base::date()
## x dplyr::filter() masks tibbletime::filter(), stats::filter()
## x lubridate::intersect() masks base::intersect()
## x dplyr::lag() masks stats::lag()
## x lubridate::setdiff() masks base::setdiff()
## x lubridate::union() masks base::union()
# Checking for missing values
# BUG FIX: is.null(sales) only tests whether the object itself is NULL; for a
# data frame it is always FALSE and says nothing about NA cells. Count the NA
# values per column instead, so the "no missing values" claim is grounded.
colSums(is.na(sales))
## (expected: 0 for both Date and Sales if nothing is missing)
There are no missing values.
# checking for duplicates
# anyDuplicated() returns the row index of the first duplicated row,
# or 0 when there are no duplicates at all.
anyDuplicated(sales)
## [1] 0
# Checking for outliers
# Visual check via boxplot; commented out to keep the knitted report compact.
#boxplot(sales)
A few outliers were detected in the Sales column.
# changing date column to date type
# BUG FIX: the dates carry four-digit years (e.g. "1/5/2019"), so the format
# must use %Y (4-digit year), not %y (2-digit year). With %y, the string
# "2019" was parsed as year "20" -> 2020, which is why the tibble preview
# below shows 2020 dates instead of 2019.
sales$Date <- as.Date(sales$Date, format = "%m/%d/%Y")
# Convert to POSIXct for the time-series tooling. Pin the timezone to UTC so
# midnight stays midnight; without tz, the local UTC offset leaks in (the
# spurious 03:00:00 timestamps seen in the preview).
sales$Date <- as.POSIXct(sales$Date, tz = "UTC")
# Changing the dataframe to tibble
# Tibbles print more compactly and avoid data.frame footguns
# (no partial matching, no silent dimension dropping).
df <- as_tibble(sales)
class(df)
## [1] "tbl_df" "tbl" "data.frame"
# Previewing our tibble
# NOTE(review): the 2020 dates and 03:00:00 times echoed below are artifacts
# of the %y date format and missing tz in the conversion above -- the raw
# data is from 2019.
head(df)
## # A tibble: 6 x 2
## Date Sales
## <dttm> <dbl>
## 1 2020-01-05 03:00:00 549.
## 2 2020-03-08 03:00:00 80.2
## 3 2020-03-03 03:00:00 341.
## 4 2020-01-27 03:00:00 489.
## 5 2020-02-08 03:00:00 634.
## 6 2020-03-25 03:00:00 628.
# Checking for any null values
# BUG FIX: is.null(df) is always FALSE for a tibble regardless of its
# contents. anyNA() actually scans the cells for missing values.
anyNA(df)
# Drop any rows containing NA before running anomaly detection
# (a no-op if anyNA(df) was FALSE).
df <- na.omit(df)
# Using timetk to detect and visualize any anomalies.
# plot_anomaly_diagnostics() decomposes the series (STL-style) and flags
# points whose remainder falls outside the estimated bands.
df %>% timetk::plot_anomaly_diagnostics(Date,Sales, .facet_ncol = 2)
## frequency = 11 observations per 1 hour
## trend = 20 observations per 12 hours
# To find the exact data points that are anomalies, we use tk_anomaly_diagnostics() function.
# Keep only rows flagged as anomalies; an empty result means none were found.
# NOTE(review): this overwrites df with the anomaly table, discarding the
# original sales data -- consider a new variable name if df is needed later.
df <- df %>% timetk::tk_anomaly_diagnostics(Date,Sales) %>% filter(anomaly=='Yes')
## frequency = 11 observations per 1 hour
## trend = 20 observations per 12 hours
df
## # A tibble: 0 x 11
## # ... with 11 variables: Date <dttm>, observed <dbl>, season <dbl>,
## # trend <dbl>, remainder <dbl>, seasadj <dbl>, remainder_l1 <dbl>,
## # remainder_l2 <dbl>, anomaly <chr>, recomposed_l1 <dbl>, recomposed_l2 <dbl>
The frequency is 11 observations per hour, with a trend of 20 observations per 12 hours. We therefore conclude that there were no anomalies in the sales trends.
# Using the anomalized package to detect anomalies.
# Alternative approach (kept for reference, not run):
#df_anomalized <- df %>%
#  time_decompose(Sales, merge = TRUE) %>%
#  anomalize(remainder) %>%
#  time_recompose()
#df_anomalized %>% glimpse()